{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional\n",
    "from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard\n",
    "from sklearn import preprocessing\n",
    "from sklearn.model_selection import train_test_split\n",
    "from yahoo_fin import stock_info as si\n",
    "from collections import deque\n",
    "\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set seed, so we can get the same results after rerunning several times\n",
    "np.random.seed(314)\n",
    "tf.random.set_seed(314)\n",
    "random.seed(314)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "from tensorflow.keras.layers import LSTM\n",
    "\n",
    "# Window size or the sequence length\n",
    "N_STEPS = 50\n",
    "# Lookup step, 1 is the next day\n",
    "LOOKUP_STEP = 15\n",
    "\n",
    "# whether to scale feature columns & output price as well\n",
    "SCALE = True\n",
    "scale_str = f\"sc-{int(SCALE)}\"\n",
    "# whether to shuffle the dataset\n",
    "SHUFFLE = True\n",
    "shuffle_str = f\"sh-{int(SHUFFLE)}\"\n",
    "# whether to split the training/testing set by date\n",
    "SPLIT_BY_DATE = False\n",
    "split_by_date_str = f\"sbd-{int(SPLIT_BY_DATE)}\"\n",
    "# test ratio size, 0.2 is 20%\n",
    "TEST_SIZE = 0.2\n",
    "# features to use\n",
    "FEATURE_COLUMNS = [\"adjclose\", \"volume\", \"open\", \"high\", \"low\"]\n",
    "# date now\n",
    "date_now = time.strftime(\"%Y-%m-%d\")\n",
    "\n",
    "### model parameters\n",
    "\n",
    "N_LAYERS = 2\n",
    "# LSTM cell\n",
    "CELL = LSTM\n",
    "# 256 LSTM neurons\n",
    "UNITS = 256\n",
    "# 40% dropout\n",
    "DROPOUT = 0.4\n",
    "# whether to use bidirectional RNNs\n",
    "BIDIRECTIONAL = False\n",
    "\n",
    "### training parameters\n",
    "\n",
    "# mean absolute error loss\n",
    "# LOSS = \"mae\"\n",
    "# huber loss\n",
    "LOSS = \"huber_loss\"\n",
    "OPTIMIZER = \"adam\"\n",
    "BATCH_SIZE = 64\n",
    "EPOCHS = 500\n",
    "\n",
    "# Amazon stock market\n",
    "ticker = \"AMZN\"\n",
    "ticker_data_filename = os.path.join(\"data\", f\"{ticker}_{date_now}.csv\")\n",
    "# model name to save, making it as unique as possible based on parameters\n",
    "model_name = f\"{date_now}_{ticker}-{shuffle_str}-{scale_str}-{split_by_date_str}-\\\n",
    "{LOSS}-{OPTIMIZER}-{CELL.__name__}-seq-{N_STEPS}-step-{LOOKUP_STEP}-layers-{N_LAYERS}-units-{UNITS}\"\n",
    "if BIDIRECTIONAL:\n",
    "    model_name += \"-b\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def shuffle_in_unison(a, b):\n",
    "    # shuffle two arrays in the same way\n",
    "    state = np.random.get_state()\n",
    "    np.random.shuffle(a)\n",
    "    np.random.set_state(state)\n",
    "    np.random.shuffle(b)\n",
    "\n",
    "\n",
    "def load_data(ticker, n_steps=50, scale=True, shuffle=True, lookup_step=1, split_by_date=True,\n",
    "                test_size=0.2, feature_columns=['adjclose', 'volume', 'open', 'high', 'low']):\n",
    "    \"\"\"\n",
    "    Loads data from Yahoo Finance source, as well as scaling, shuffling, normalizing and splitting.\n",
    "    Params:\n",
    "        ticker (str/pd.DataFrame): the ticker you want to load, examples include AAPL, TESL, etc.\n",
    "        n_steps (int): the historical sequence length (i.e window size) used to predict, default is 50\n",
    "        scale (bool): whether to scale prices from 0 to 1, default is True\n",
    "        shuffle (bool): whether to shuffle the dataset (both training & testing), default is True\n",
    "        lookup_step (int): the future lookup step to predict, default is 1 (e.g next day)\n",
    "        split_by_date (bool): whether we split the dataset into training/testing by date, setting it \n",
    "            to False will split datasets in a random way\n",
    "        test_size (float): ratio for test data, default is 0.2 (20% testing data)\n",
    "        feature_columns (list): the list of features to use to feed into the model, default is everything grabbed from yahoo_fin\n",
    "    \"\"\"\n",
    "    # see if ticker is already a loaded stock from yahoo finance\n",
    "    if isinstance(ticker, str):\n",
    "        # load it from yahoo_fin library\n",
    "        df = si.get_data(ticker)\n",
    "    elif isinstance(ticker, pd.DataFrame):\n",
    "        # already loaded, use it directly\n",
    "        df = ticker\n",
    "    else:\n",
    "        raise TypeError(\"ticker can be either a str or a `pd.DataFrame` instances\")\n",
    "\n",
    "    # this will contain all the elements we want to return from this function\n",
    "    result = {}\n",
    "    # we will also return the original dataframe itself\n",
    "    result['df'] = df.copy()\n",
    "\n",
    "    # make sure that the passed feature_columns exist in the dataframe\n",
    "    for col in feature_columns:\n",
    "        assert col in df.columns, f\"'{col}' does not exist in the dataframe.\"\n",
    "\n",
    "    # add date as a column\n",
    "    if \"date\" not in df.columns:\n",
    "        df[\"date\"] = df.index\n",
    "\n",
    "    if scale:\n",
    "        column_scaler = {}\n",
    "        # scale the data (prices) from 0 to 1\n",
    "        for column in feature_columns:\n",
    "            scaler = preprocessing.MinMaxScaler()\n",
    "            df[column] = scaler.fit_transform(np.expand_dims(df[column].values, axis=1))\n",
    "            column_scaler[column] = scaler\n",
    "\n",
    "        # add the MinMaxScaler instances to the result returned\n",
    "        result[\"column_scaler\"] = column_scaler\n",
    "\n",
    "    # add the target column (label) by shifting by `lookup_step`\n",
    "    df['future'] = df['adjclose'].shift(-lookup_step)\n",
    "\n",
    "    # last `lookup_step` columns contains NaN in future column\n",
    "    # get them before droping NaNs\n",
    "    last_sequence = np.array(df[feature_columns].tail(lookup_step))\n",
    "    \n",
    "    # drop NaNs\n",
    "    df.dropna(inplace=True)\n",
    "\n",
    "    sequence_data = []\n",
    "    sequences = deque(maxlen=n_steps)\n",
    "\n",
    "    for entry, target in zip(df[feature_columns + [\"date\"]].values, df['future'].values):\n",
    "        sequences.append(entry)\n",
    "        if len(sequences) == n_steps:\n",
    "            sequence_data.append([np.array(sequences), target])\n",
    "\n",
    "    # get the last sequence by appending the last `n_step` sequence with `lookup_step` sequence\n",
    "    # for instance, if n_steps=50 and lookup_step=10, last_sequence should be of 60 (that is 50+10) length\n",
    "    # this last_sequence will be used to predict future stock prices that are not available in the dataset\n",
    "    last_sequence = list([s[:len(feature_columns)] for s in sequences]) + list(last_sequence)\n",
    "    last_sequence = np.array(last_sequence).astype(np.float32)\n",
    "    # add to result\n",
    "    result['last_sequence'] = last_sequence\n",
    "    \n",
    "    # construct the X's and y's\n",
    "    X, y = [], []\n",
    "    for seq, target in sequence_data:\n",
    "        X.append(seq)\n",
    "        y.append(target)\n",
    "\n",
    "    # convert to numpy arrays\n",
    "    X = np.array(X)\n",
    "    y = np.array(y)\n",
    "\n",
    "    if split_by_date:\n",
    "        # split the dataset into training & testing sets by date (not randomly splitting)\n",
    "        train_samples = int((1 - test_size) * len(X))\n",
    "        result[\"X_train\"] = X[:train_samples]\n",
    "        result[\"y_train\"] = y[:train_samples]\n",
    "        result[\"X_test\"]  = X[train_samples:]\n",
    "        result[\"y_test\"]  = y[train_samples:]\n",
    "        if shuffle:\n",
    "            # shuffle the datasets for training (if shuffle parameter is set)\n",
    "            shuffle_in_unison(result[\"X_train\"], result[\"y_train\"])\n",
    "            shuffle_in_unison(result[\"X_test\"], result[\"y_test\"])\n",
    "    else:    \n",
    "        # split the dataset randomly\n",
    "        result[\"X_train\"], result[\"X_test\"], result[\"y_train\"], result[\"y_test\"] = train_test_split(X, y, \n",
    "                                                                                test_size=test_size, shuffle=shuffle)\n",
    "\n",
    "    # get the list of test set dates\n",
    "    dates = result[\"X_test\"][:, -1, -1]\n",
    "    # retrieve test features from the original dataframe\n",
    "    result[\"test_df\"] = result[\"df\"].loc[dates]\n",
    "    # remove duplicated dates in the testing dataframe\n",
    "    result[\"test_df\"] = result[\"test_df\"][~result[\"test_df\"].index.duplicated(keep='first')]\n",
    "    # remove dates from the training/testing sets & convert to float32\n",
    "    result[\"X_train\"] = result[\"X_train\"][:, :, :len(feature_columns)].astype(np.float32)\n",
    "    result[\"X_test\"] = result[\"X_test\"][:, :, :len(feature_columns)].astype(np.float32)\n",
    "\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_model(sequence_length, n_features, units=256, cell=LSTM, n_layers=2, dropout=0.3,\n",
    "                loss=\"mean_absolute_error\", optimizer=\"rmsprop\", bidirectional=False):\n",
    "    model = Sequential()\n",
    "    for i in range(n_layers):\n",
    "        if i == 0:\n",
    "            # first layer\n",
    "            if bidirectional:\n",
    "                model.add(Bidirectional(cell(units, return_sequences=True), batch_input_shape=(None, sequence_length, n_features)))\n",
    "            else:\n",
    "                model.add(cell(units, return_sequences=True, batch_input_shape=(None, sequence_length, n_features)))\n",
    "        elif i == n_layers - 1:\n",
    "            # last layer\n",
    "            if bidirectional:\n",
    "                model.add(Bidirectional(cell(units, return_sequences=False)))\n",
    "            else:\n",
    "                model.add(cell(units, return_sequences=False))\n",
    "        else:\n",
    "            # hidden layers\n",
    "            if bidirectional:\n",
    "                model.add(Bidirectional(cell(units, return_sequences=True)))\n",
    "            else:\n",
    "                model.add(cell(units, return_sequences=True))\n",
    "        # add dropout after each layer\n",
    "        model.add(Dropout(dropout))\n",
    "    model.add(Dense(1, activation=\"linear\"))\n",
    "    model.compile(loss=loss, metrics=[\"mean_absolute_error\"], optimizer=optimizer)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "outputPrepend"
    ]
   },
   "outputs": [],
   "source": [
    "# create these folders if they does not exist\n",
    "if not os.path.isdir(\"results\"):\n",
    "    os.mkdir(\"results\")\n",
    "\n",
    "if not os.path.isdir(\"logs\"):\n",
    "    os.mkdir(\"logs\")\n",
    "\n",
    "if not os.path.isdir(\"data\"):\n",
    "    os.mkdir(\"data\")\n",
    "\n",
    "# load the data\n",
    "data = load_data(ticker, N_STEPS, scale=SCALE, split_by_date=SPLIT_BY_DATE, \n",
    "                shuffle=SHUFFLE, lookup_step=LOOKUP_STEP, test_size=TEST_SIZE, \n",
    "                feature_columns=FEATURE_COLUMNS)\n",
    "\n",
    "# save the dataframe\n",
    "data[\"df\"].to_csv(ticker_data_filename)\n",
    "\n",
    "# construct the model\n",
    "model = create_model(N_STEPS, len(FEATURE_COLUMNS), loss=LOSS, units=UNITS, cell=CELL, n_layers=N_LAYERS,\n",
    "                    dropout=DROPOUT, optimizer=OPTIMIZER, bidirectional=BIDIRECTIONAL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# some tensorflow callbacks\n",
    "checkpointer = ModelCheckpoint(os.path.join(\"results\", model_name + \".h5\"), save_weights_only=True, save_best_only=True, verbose=1)\n",
    "tensorboard = TensorBoard(log_dir=os.path.join(\"logs\", model_name))\n",
    "# train the model and save the weights whenever we see \n",
    "# a new optimal model using ModelCheckpoint\n",
    "history = model.fit(data[\"X_train\"], data[\"y_train\"],\n",
    "                    batch_size=BATCH_SIZE,\n",
    "                    epochs=EPOCHS,\n",
    "                    validation_data=(data[\"X_test\"], data[\"y_test\"]),\n",
    "                    callbacks=[checkpointer, tensorboard],\n",
    "                    verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def plot_graph(test_df):\n",
    "    \"\"\"\n",
    "    This function plots true close price along with predicted close price\n",
    "    with blue and red colors respectively\n",
    "    \"\"\"\n",
    "    plt.plot(test_df[f'true_adjclose_{LOOKUP_STEP}'], c='b')\n",
    "    plt.plot(test_df[f'adjclose_{LOOKUP_STEP}'], c='r')\n",
    "    plt.xlabel(\"Days\")\n",
    "    plt.ylabel(\"Price\")\n",
    "    plt.legend([\"Actual Price\", \"Predicted Price\"])\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_final_df(model, data):\n",
    "    \"\"\"\n",
    "    This function takes the `model` and `data` dict to \n",
    "    construct a final dataframe that includes the features along \n",
    "    with true and predicted prices of the testing dataset\n",
    "    \"\"\"\n",
    "    # if predicted future price is higher than the current, \n",
    "    # then calculate the true future price minus the current price, to get the buy profit\n",
    "    buy_profit  = lambda current, pred_future, true_future: true_future - current if pred_future > current else 0\n",
    "    # if the predicted future price is lower than the current price,\n",
    "    # then subtract the true future price from the current price\n",
    "    sell_profit = lambda current, pred_future, true_future: current - true_future if pred_future < current else 0\n",
    "    X_test = data[\"X_test\"]\n",
    "    y_test = data[\"y_test\"]\n",
    "    # perform prediction and get prices\n",
    "    y_pred = model.predict(X_test)\n",
    "    if SCALE:\n",
    "        y_test = np.squeeze(data[\"column_scaler\"][\"adjclose\"].inverse_transform(np.expand_dims(y_test, axis=0)))\n",
    "        y_pred = np.squeeze(data[\"column_scaler\"][\"adjclose\"].inverse_transform(y_pred))\n",
    "    test_df = data[\"test_df\"]\n",
    "    # add predicted future prices to the dataframe\n",
    "    test_df[f\"adjclose_{LOOKUP_STEP}\"] = y_pred\n",
    "    # add true future prices to the dataframe\n",
    "    test_df[f\"true_adjclose_{LOOKUP_STEP}\"] = y_test\n",
    "    # sort the dataframe by date\n",
    "    test_df.sort_index(inplace=True)\n",
    "    final_df = test_df\n",
    "    # add the buy profit column\n",
    "    final_df[\"buy_profit\"] = list(map(buy_profit, \n",
    "                                    final_df[\"adjclose\"], \n",
    "                                    final_df[f\"adjclose_{LOOKUP_STEP}\"], \n",
    "                                    final_df[f\"true_adjclose_{LOOKUP_STEP}\"])\n",
    "                                    # since we don't have profit for last sequence, add 0's\n",
    "                                    )\n",
    "    # add the sell profit column\n",
    "    final_df[\"sell_profit\"] = list(map(sell_profit, \n",
    "                                    final_df[\"adjclose\"], \n",
    "                                    final_df[f\"adjclose_{LOOKUP_STEP}\"], \n",
    "                                    final_df[f\"true_adjclose_{LOOKUP_STEP}\"])\n",
    "                                    # since we don't have profit for last sequence, add 0's\n",
    "                                    )\n",
    "    return final_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(model, data):\n",
    "    # retrieve the last sequence from data\n",
    "    last_sequence = data[\"last_sequence\"][-N_STEPS:]\n",
    "    # expand dimension\n",
    "    last_sequence = np.expand_dims(last_sequence, axis=0)\n",
    "    # get the prediction (scaled from 0 to 1)\n",
    "    prediction = model.predict(last_sequence)\n",
    "    # get the price (by inverting the scaling)\n",
    "    if SCALE:\n",
    "        predicted_price = data[\"column_scaler\"][\"adjclose\"].inverse_transform(prediction)[0][0]\n",
    "    else:\n",
    "        predicted_price = prediction[0][0]\n",
    "    return predicted_price"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load optimal model weights from results folder\n",
    "model_path = os.path.join(\"results\", model_name) + \".h5\"\n",
    "model.load_weights(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the model\n",
    "loss, mae = model.evaluate(data[\"X_test\"], data[\"y_test\"], verbose=0)\n",
    "# calculate the mean absolute error (inverse scaling)\n",
    "if SCALE:\n",
    "    mean_absolute_error = data[\"column_scaler\"][\"adjclose\"].inverse_transform([[mae]])[0][0]\n",
    "else:\n",
    "    mean_absolute_error = mae"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get the final dataframe for the testing set\n",
    "final_df = get_final_df(model, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict the future price\n",
    "future_price = predict(model, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we calculate the accuracy by counting the number of positive profits\n",
    "accuracy_score = (len(final_df[final_df['sell_profit'] > 0]) + len(final_df[final_df['buy_profit'] > 0])) / len(final_df)\n",
    "# calculating total buy & sell profit\n",
    "total_buy_profit  = final_df[\"buy_profit\"].sum()\n",
    "total_sell_profit = final_df[\"sell_profit\"].sum()\n",
    "# total profit by adding sell & buy together\n",
    "total_profit = total_buy_profit + total_sell_profit\n",
    "# dividing total profit by number of testing samples (number of trades)\n",
    "profit_per_trade = total_profit / len(final_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# printing metrics\n",
    "print(f\"Future price after {LOOKUP_STEP} days is {future_price:.2f}$\")\n",
    "print(f\"{LOSS} loss:\", loss)\n",
    "print(\"Mean Absolute Error:\", mean_absolute_error)\n",
    "print(\"Accuracy score:\", accuracy_score)\n",
    "print(\"Total buy profit:\", total_buy_profit)\n",
    "print(\"Total sell profit:\", total_sell_profit)\n",
    "print(\"Total profit:\", total_profit)\n",
    "print(\"Profit per trade:\", profit_per_trade)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot true/pred prices graph\n",
    "plot_graph(final_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df.tail(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the final dataframe to csv-results folder\n",
    "csv_results_folder = \"csv-results\"\n",
    "if not os.path.isdir(csv_results_folder):\n",
    "    os.mkdir(csv_results_folder)\n",
    "csv_filename = os.path.join(csv_results_folder, model_name + \".csv\")\n",
    "final_df.to_csv(csv_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}